# Load packages
library(here)
library(dplyr)
library(readr)
library(rstan)
library(bayesrules)
library(tidyverse)
library(bayesplot)
library(rstanarm)
library(janitor)
library(tidybayes)
library(broom.mixed)
library(here)
library(sf)
library(tidycensus)
library(openxlsx)
library(s2)
# themes
theme_set(theme_minimal())
vari_names <- read_csv(here("clean_data", "nyc_names.csv"))
nyc_clean <- st_read(here("clean_data", "nyc_data.shp"), crs = 4269) 
## Reading layer `nyc_data' from data source 
##   `/Users/freddy/Documents/GitHub/454/clean_data/nyc_data.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 223 features and 24 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.04731 ymin: 40.55685 xmax: -73.70002 ymax: 40.91758
## Geodetic CRS:  NAD83
colnames(nyc_clean) <- colnames(vari_names)
subway_stations <- st_read(here("ethnic","Data","stations", "geo_export_85568705-efba-4456-bdc0-3d70ff2cf8e5.shp")) %>%
  st_transform(., 4269)
## Reading layer `geo_export_85568705-efba-4456-bdc0-3d70ff2cf8e5' from data source 
##   `/Users/freddy/Documents/GitHub/454/ethnic/Data/stations/geo_export_85568705-efba-4456-bdc0-3d70ff2cf8e5.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 473 features and 5 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -74.03088 ymin: 40.57603 xmax: -73.7554 ymax: 40.90313
## Geodetic CRS:  WGS84(DD)
bus_stations <- st_read(here("ethnic","Data","bus", "bus_stops_nyc_may2020.shp")) %>%
  st_transform(., 4269)
## Reading layer `bus_stops_nyc_may2020' from data source 
##   `/Users/freddy/Documents/GitHub/454/ethnic/Data/bus/bus_stops_nyc_may2020.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 14663 features and 6 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 914169.1 ymin: 122626.8 xmax: 1066982 ymax: 271696.8
## Projected CRS: NAD83 / New York Long Island (ftUS)
transit_points <- read_csv(here("transit","ridership_points.csv"))%>%
  separate(Position, into=c("Point", "longitude", "latitude"), " ") %>%
  mutate(latitude = str_remove_all(latitude, "[)]"),
         longitude = str_remove_all(longitude, "[()]"),
         ) %>%
  dplyr::select(-c(Point)) %>% 
  mutate(latitude = as.numeric(latitude),
         longitude = as.numeric(longitude)) %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4269)
#plot locations over map
subway_loc <- ggplot() +
  geom_sf(data = nyc_clean, fill = "#EBF6FF", color = "#D48DD8", size = 0.15, alpha = .8) +
  geom_sf(data = subway_stations, color="#3F123C", size=1) + 
  coord_sf(datum = st_crs(subway_stations)) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Subway Stop Locations \nin NYC")+ 
    theme(#panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

bus_loc <- ggplot() +
  geom_sf(data = nyc_clean, fill = "#EBF6FF", color = "#D48DD8", size = 0.15, alpha = .8) +
  geom_sf(data = bus_stations, color="#3F123C", size=.5, alpha=.5) + 
  coord_sf(datum = st_crs(subway_stations)) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Bus Stop Locations \nin NYC")+ 
    theme(#panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))


stops <- nyc_clean %>%
  ggplot() +
  geom_sf(aes(fill = sub_count), color = "#8f98aa") +
  scale_fill_gradient(low= "lavender", high = "maroon",
                      guide = guide_legend(title = "Number of Subway Stops") ,na.value="#D6D6D6") +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Subway Stop Counts \nin NYC")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

bus_stops <- nyc_clean %>%
  ggplot() +
  geom_sf(aes(fill = bus_count), color = "#8f98aa") +
  scale_fill_gradient(low= "lavender", high = "maroon",
                      guide = guide_legend(title = "Number of Bus Stops") ,na.value="#D6D6D6") +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Bus Stop Counts \nin NYC")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))


ridership <- nyc_clean%>%
  ggplot() +
  geom_sf(aes(fill = log2(mean_ridership)), color = "#8f98aa") +
  scale_fill_gradient(low= "lavender", high = "maroon",
                      guide = guide_legend(title = "Log2 Mean Ridership") ,na.value="#D6D6D6") +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Mean (Log2) Subway Turnstile \nRidership in 2018 \nfor NYC")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))


access <- nyc_clean %>%
  ggplot() +
  geom_sf(aes(fill = perc_covered_by_transit), color = "#8f98aa") +
  scale_fill_gradient(low= "lavender", high = "maroon",
                      guide = guide_legend(title = "Percent of Neighborhood within \n walking distance of a transit station"), na.value="#D6D6D6") +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Transit Accessibility \nin NYC")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

`

red <- ggplot(nyc_clean) +
  geom_sf(aes(fill = below_poverty_line_count), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#E13728", guide = guide_legend(title = "Number Below \nPoverty Line")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Impoverishement")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

yellow <- ggplot(nyc_clean) +
  geom_sf(aes(fill = mean_income), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#F3D24E", guide = guide_legend(title = "Mean Income")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Mean Income")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12),
          legend.text = element_text(size = 12)) +
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

teal <- ggplot(nyc_clean) +
  geom_sf(aes(fill = mean_rent), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#2DBDC7", guide = guide_legend(title = "Dollars")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Mean Rent")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12),
          legend.text = element_text(size = 12)) +
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

purple <- ggplot(nyc_clean) +
  geom_sf(aes(fill = eviction_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#7826C0", guide = guide_legend(title = "Number of Evictions")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Evictions")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

orange <- ggplot(nyc_clean) +
  geom_sf(aes(fill = unemployment_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#FC9228", guide = guide_legend(title = "Number on \nUnemployment")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Unemployment")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12),
          legend.text = element_text(size = 12)) +
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

green <- ggplot(nyc_clean) +
  geom_sf(aes(fill = store_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#326902", guide = guide_legend(title = "Number of Stores")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Retail Food Stores")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

blue <- ggplot(nyc_clean) +
  geom_sf(aes(fill = school_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#5372C4", 
                      guide = guide_legend(title = "Number of Schools")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Number of Schools")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

pink <- ggplot(nyc_clean) +
  geom_sf(aes(fill = total_pop), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#F450E1", guide = guide_legend(title = "Number of People")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))


brown <- ggplot(nyc_clean) +
  geom_sf(aes(fill = uninsured_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#F8E3DD", high = "#6A4D39", guide = guide_legend(title = "Number of People \n without Insurance Coverage")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Insurance Coverage")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))
white <- ggplot(nyc_clean) +
  geom_sf(aes(fill = white_count), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#7B435B", guide = guide_legend(title = "Number White")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("White Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

black <- ggplot(nyc_clean) +
  geom_sf(aes(fill = black_count), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#F25F5C", guide = guide_legend(title = "Number Black")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Black Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

asian <- ggplot(nyc_clean) +
  geom_sf(aes(fill = asian_count), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#717EC3", guide = guide_legend(title = "Number Asian")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Asian Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

latinx <- ggplot(nyc_clean) +
  geom_sf(aes(fill = latinx_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#FC9A38", guide = guide_legend(title = "Number Latinx")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Latinx Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

Data Introduction

All the data used in this project are from two major sources: the Tidycensus package and NYC Open Data.

Tidycensus is an R package interface, developed by Kyle Walker and Matt Herman, that enables easy access to the US Census Bureau’s data APIs and returns Tidyverse-ready data frames from various major US Census Bureau datasets. Our demographic and socioeconomic data are drawn from the American Community Survey results found in Tidycensus package. A summary of our ACS data variables is below:

  • total_pop: Total Population by Neighborhood

  • mean_income: Mean Income by Neighborhood

  • below_poverty_line_count: Number of People Living Below the 100% Poverty Line by Neighborhood

  • mean_rent: Mean Rent by Neighborhood

  • unemployment_count: Number of People on Unemployment by Neighborhood

  • latinx_count: Number of Latinx People by Neighborhood

  • white_count: Number of White People by Neighborhood

  • black_count: Number of Black People by Neighborhood

  • native_count: Number of Native People by Neighborhood

  • asian_count: Number of Asian People by Neighborhood

  • naturalized_citizen_count: Number of Naturalized Citizens by Neighborhood

  • noncitizen_count: Number of Foreign Born People by Neighborhood

  • uninsured_count: Number of Uninsured Citizens of any Age by Neighborhood

For remaining predictors, we used NYC Open Data’s portal to identify specific predictors. In particular, we used geotagged locations of Subway Stops, Bus Stops, Grocery Stores, Schools, and Eviction Sites from the Departments of Transportation, Health, Education, and Housing to calculate neighborhood-specific variables described below:

  • school_count: Number of Public Schools by Neighborhood
  • eviction_count: Number of Evictions by Neighborhood
  • store_count: Number of Grocery Stores and Food Vendors by Neighborhood
  • sub_count: Number of Subway Stations by Neighborhood
  • bus_count: Number of Bus Stations by Neighborhood
  • perc_covered_by_transit: Percent of Neighborhood within walking distance (.25 miles) of any transit stop.

Lastly, we acquired subway ridership from Metropolitan Transportation Authority’s turnstile data for the week of September 7, 2019. For each station, entry/exit of each turnstile is recorded. Then, we aggregated this information by taking the station-specific average of subway ridership across the 7 days in the week. Finally, we geotagged each listed station, then took the mean of ridership at all subway stations in each neighborhood.

Data Summaries

Our data has 242 observations of 23 variables. Below is a preview of our dataset with colnames attached.

library(kableExtra)
dim(nyc_clean)
## [1] 223  25
kable(head(nyc_clean)) %>% kable_styling() 
nta_id total_pop mean_income below_poverty_line_count below_poverty_line_and_50_count mean_rent unemployment_count latinx_count white_count black_count native_count asian_count naturalized_citizen_count noncitizen_count uninsured_count school_count eviction_count store_count sub_count bus_count mean_ridership perc_covered_by_transit transportation_desert_3cat transportation_desert_4cat geometry
BK0101 26308 98338.67 2397 1289 2062.667 582 3284 20526 482 40 1052 3777 3129 1797 6 68 71 2 53 9410.500 28.36395 Satisfactory Access Limited Access MULTIPOLYGON (((-73.94074 4…
BK0102 57774 101238.92 9120 4474 2330.077 1710 18227 32237 1460 0 4008 6802 7746 3725 12 204 129 2 97 26603.000 55.26492 Satisfactory Access Satisfactory Access MULTIPOLYGON (((-73.96355 4…
BK0103 36891 30309.25 18285 5970 1194.875 457 3351 31799 1288 20 194 3548 1012 711 6 45 58 3 35 6348.667 87.71022 Satisfactory Access Satisfactory Access MULTIPOLYGON (((-73.96762 4…
BK0104 41861 78746.25 8406 3876 1801.667 1678 13602 17682 3960 210 5515 5770 5325 3040 15 157 117 6 77 8006.000 63.00854 Satisfactory Access Satisfactory Access MULTIPOLYGON (((-73.95083 4…
BK0201 23758 140543.00 1504 585 2275.833 579 1517 17643 1330 44 2041 2082 1448 852 1 25 30 2 18 5275.000 98.46067 Satisfactory Access Satisfactory Access MULTIPOLYGON (((-73.99066 4…
BK0202 24603 132850.00 3776 970 2413.875 1195 3772 13288 3369 0 2893 2151 2728 1244 17 111 73 8 104 19444.000 159.13596 Excellent Access Excellent Access MULTIPOLYGON (((-73.99327 4…

Below is a summary of each variable’s distribution.

summary(nyc_clean) 
##     nta_id            total_pop      mean_income     below_poverty_line_count
##  Length:223         Min.   :    0   Min.   : 23149   Min.   :    0           
##  Class :character   1st Qu.:18106   1st Qu.: 50624   1st Qu.: 1544           
##  Mode  :character   Median :31876   Median : 67335   Median : 4180           
##                     Mean   :32357   Mean   : 72094   Mean   : 5905           
##                     3rd Qu.:47184   3rd Qu.: 87155   3rd Qu.: 8785           
##                     Max.   :97786   Max.   :211822   Max.   :28755           
##                                     NA's   :42                               
##  below_poverty_line_and_50_count   mean_rent    unemployment_count
##  Min.   :    0.0                 Min.   : 792   Min.   :   0.0    
##  1st Qu.:  830.5                 1st Qu.:1321   1st Qu.: 423.5    
##  Median : 2693.0                 Median :1510   Median : 921.0    
##  Mean   : 3091.2                 Mean   :1603   Mean   :1084.3    
##  3rd Qu.: 4652.0                 3rd Qu.:1746   3rd Qu.:1534.0    
##  Max.   :13934.0                 Max.   :3279   Max.   :4697.0    
##                                  NA's   :42                       
##   latinx_count    white_count     black_count     native_count   
##  Min.   :    0   Min.   :    0   Min.   :    0   Min.   :  0.00  
##  1st Qu.: 1970   1st Qu.:  481   1st Qu.:  469   1st Qu.:  0.00  
##  Median : 5558   Median : 5010   Median : 2013   Median : 28.00  
##  Mean   : 9660   Mean   : 9872   Mean   : 7283   Mean   : 56.68  
##  3rd Qu.:13914   3rd Qu.:14762   3rd Qu.: 9138   3rd Qu.: 72.00  
##  Max.   :60712   Max.   :69259   Max.   :69697   Max.   :601.00  
##                                                                  
##   asian_count      naturalized_citizen_count noncitizen_count uninsured_count  
##  Min.   :    0.0   Min.   :    0             Min.   :    0    Min.   :    0.0  
##  1st Qu.:  265.5   1st Qu.: 2662             1st Qu.: 1554    1st Qu.:  704.5  
##  Median : 2363.0   Median : 6414             Median : 4625    Median : 1920.0  
##  Mean   : 4530.3   Mean   : 6930             Mean   : 5237    Mean   : 2452.1  
##  3rd Qu.: 6054.5   3rd Qu.: 9768             3rd Qu.: 7435    3rd Qu.: 3376.0  
##  Max.   :41314.0   Max.   :31847             Max.   :25008    Max.   :12182.0  
##                                                                                
##   school_count    eviction_count    store_count       sub_count     
##  Min.   : 1.000   Min.   :   1.0   Min.   :  1.00   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.:  48.0   1st Qu.: 17.00   1st Qu.: 1.000  
##  Median : 5.000   Median : 148.0   Median : 45.00   Median : 1.000  
##  Mean   : 6.924   Mean   : 238.1   Mean   : 51.98   Mean   : 2.399  
##  3rd Qu.: 9.500   3rd Qu.: 371.0   3rd Qu.: 76.50   3rd Qu.: 3.000  
##  Max.   :31.000   Max.   :1126.0   Max.   :202.00   Max.   :17.000  
##                                                                     
##    bus_count      mean_ridership   perc_covered_by_transit
##  Min.   :  1.00   Min.   :   273   Min.   :  0.00000      
##  1st Qu.: 30.00   1st Qu.:  4984   1st Qu.:  0.00836      
##  Median : 49.00   Median :  8006   Median : 33.95065      
##  Mean   : 52.83   Mean   : 12051   Mean   : 45.42951      
##  3rd Qu.: 68.00   3rd Qu.: 14205   3rd Qu.: 69.10730      
##  Max.   :243.00   Max.   :109922   Max.   :254.34296      
##                   NA's   :100                             
##  transportation_desert_3cat transportation_desert_4cat          geometry  
##  Length:223                 Length:223                 MULTIPOLYGON :223  
##  Class :character           Class :character           epsg:4269    :  0  
##  Mode  :character           Mode  :character           +proj=long...:  0  
##                                                                           
##                                                                           
##                                                                           
## 

Data Visuals

library(egg)
ggarrange(subway_loc, bus_loc, stops, bus_stops, ridership, access, ncol=2)

ggarrange(red, orange, yellow, green, teal, blue, purple, pink, brown, ncol=3)

ggarrange(white, black, latinx, asian, ncol=2)

Model Building

Models 1 & 2: No Grouping

We fit ungrouped poisson and negative binomial models below:

poisson_non_hierarchical <- stan_glm(
  white_count ~  total_pop + mean_income + mean_rent + 
    unemployment_count + sub_count + 
    perc_covered_by_transit + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count,
  data = nyc_clean,
  family = poisson,
  chains = 4, iter = 500*2, seed = 84735, refresh = 0
)

negbin_non_hierarchical <- stan_glm(
  white_count ~  total_pop + mean_income + mean_rent + 
    unemployment_count + sub_count + 
    perc_covered_by_transit + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count,
  data = nyc_clean,
  family = neg_binomial_2,
  chains = 4, iter = 500*2, seed = 84735, refresh = 0
)
check_1 <- pp_check(poisson_non_hierarchical) + 
  xlab("White Resident Count") +
  labs(title = "Poisson")+
  xlim(0,100000)+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 
check_2 <- pp_check(negbin_non_hierarchical) + 
  xlab("White Resident Count") +
  labs(title = "Negative Binomial")+
  xlim(0,100000)+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

ggpubr::ggarrange(check_1, check_2, ncol=2, common.legend=TRUE, legend = "right")

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_poisson <-  posterior_predict(
  poisson_non_hierarchical, newdata = nyc_predict_clean)

predictions_negbin <-  posterior_predict(
  poisson_non_hierarchical, newdata = nyc_predict_clean)

library(tidybayes)
library(bayesplot)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
              prob_outer = 0.95) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
  xaxis_text(angle = 90, hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.y = element_blank(),
        axis.text.y = element_blank())+
  coord_flip()

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
              prob_outer = 0.95) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
    xaxis_text(angle = 90,  hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.y = element_blank(),
        axis.text.y = element_blank())+
  coord_flip()

Models 2 & 3: Hierarchy by Neighborhood

Poisson

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, \sigma_y & \sim \text{Pois}(\lambda_{ij}) \\ & \text{where} \log(\lambda_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c}, \beta_k & \stackrel{ind}{\sim} N(0,1) \\ \sigma_y, \sigma_0 & \stackrel{ind}{\sim} Exp(1) \end{split}\]

poisson_hierarchical <- stan_glmer(
 white_count ~  total_pop + mean_income + mean_rent + 
    unemployment_count + sub_count + 
    perc_covered_by_transit + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count + (1 | nta_id),
  data = nyc_clean,
  family = poisson,
  chains = 4, iter = 500*2, seed = 84735, refresh = 0
)
check_1 <- pp_check(poisson_hierarchical) + 
  xlab("White Resident Count") +
  labs(title = "Poisson")+
  xlim(0,100000)+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

library(kableExtra)

tidy(poisson_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.95) %>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Hierarchicahl Poisson - Model Summary") %>% 
  kable_styling()
Hierarchicahl Poisson - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 1054.7347298 0.3992025 449.1941244 2356.6935076
total_pop 0.0071261 0.0000090 0.0051391 0.0089718
eviction_count -0.3148467 0.0005230 -0.4155584 -0.2091430
uninsured_count -0.0164431 0.0000572 -0.0274059 -0.0051331
nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_poisson <-  posterior_predict(
  poisson_hierarchical, newdata = nyc_predict_clean)

library(tidybayes)
library(bayesplot)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
              prob_outer = 0.95) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
  xaxis_text(angle = 90, hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.y = element_blank(),
        axis.text.y = element_blank())+
  coord_flip() 

Negative Binomial

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, \sigma_y & \sim \text{NegBin}(\mu_{ij}, r) \\ & \text{where} \log(\mu_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c}, \beta_k & \stackrel{ind}{\sim} N(0,1) \\ \sigma_y, \sigma_0 & \stackrel{ind}{\sim} Exp(1) \end{split}\]

negbin_hierarchical <- stan_glmer(
 white_count ~  total_pop + mean_income + mean_rent + 
    unemployment_count + sub_count + 
    perc_covered_by_transit + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count + (1 | nta_id),
  data = nyc_clean,
  family = neg_binomial_2,
  chains = 4, iter = 500*2, seed = 84735, refresh = 0)
check_2 <- pp_check(negbin_hierarchical) + 
  xlab("White Resident Count") +
  xlim(0,100000)+
  labs(title = "Negative Binomial")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

tidy(negbin_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.95)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 2153.3432562 0.4124044 989.4476079 4798.3913302
total_pop 0.0062732 0.0000098 0.0045646 0.0081272
eviction_count -0.2574631 0.0004905 -0.3540952 -0.1618246
uninsured_count -0.0183541 0.0000552 -0.0288213 -0.0071493
nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_negbin <-  posterior_predict(
  negbin_hierarchical, newdata = nyc_predict_clean)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
              prob_outer = 0.95) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
    xaxis_text(angle = 90,  hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.y = element_blank(),
        axis.text.y = element_blank())+
  coord_flip()